library("dplyr")
library("ggplot2")
library("plotly")
## Warning: package 'plotly' was built under R version 4.4.2
library("zoo")
library("tidyr")

1. Read in the Data

library(data.table)
cv_states <- fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
state_pops <- fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv")

state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL

cv_states <- merge(cv_states, state_pops, by = "state")

2. Look at the data

dim(cv_states)
## [1] 58094     9
head(cv_states)
## Key: <state>
##      state       date  fips cases deaths geo_id population pop_density    abb
##     <char>     <IDat> <int> <int>  <int>  <int>      <int>       <num> <char>
## 1: Alabama 2020-03-13     1     6      0      1    4887871    96.50939     AL
## 2: Alabama 2020-03-14     1    12      0      1    4887871    96.50939     AL
## 3: Alabama 2020-03-15     1    23      0      1    4887871    96.50939     AL
## 4: Alabama 2020-03-16     1    29      0      1    4887871    96.50939     AL
## 5: Alabama 2020-03-17     1    39      0      1    4887871    96.50939     AL
## 6: Alabama 2020-03-18     1    51      0      1    4887871    96.50939     AL
tail(cv_states)
## Key: <state>
##      state       date  fips  cases deaths geo_id population pop_density    abb
##     <char>     <IDat> <int>  <int>  <int>  <int>      <int>       <num> <char>
## 1: Wyoming 2023-03-18    56 185640   2009     56     577737    5.950611     WY
## 2: Wyoming 2023-03-19    56 185640   2009     56     577737    5.950611     WY
## 3: Wyoming 2023-03-20    56 185640   2009     56     577737    5.950611     WY
## 4: Wyoming 2023-03-21    56 185800   2014     56     577737    5.950611     WY
## 5: Wyoming 2023-03-22    56 185800   2014     56     577737    5.950611     WY
## 6: Wyoming 2023-03-23    56 185800   2014     56     577737    5.950611     WY
str(cv_states)
## Classes 'data.table' and 'data.frame':   58094 obs. of  9 variables:
##  $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ date       : IDate, format: "2020-03-13" "2020-03-14" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
##  $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : chr  "AL" "AL" "AL" "AL" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "state"

3. Format the data

# Format the date
cv_states$date <- as.Date(cv_states$date, format = "%Y-%m-%d")

# Format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)

# Order the data first by state, second by date
cv_states <- cv_states[order(cv_states$state, cv_states$date), ]

# Confirm the variables are now correctly formatted
str(cv_states)
## Classes 'data.table' and 'data.frame':   58094 obs. of  9 variables:
##  $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date       : Date, format: "2020-03-13" "2020-03-14" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
##  $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
head(cv_states)
##      state       date  fips cases deaths geo_id population pop_density    abb
##     <fctr>     <Date> <int> <int>  <int>  <int>      <int>       <num> <fctr>
## 1: Alabama 2020-03-13     1     6      0      1    4887871    96.50939     AL
## 2: Alabama 2020-03-14     1    12      0      1    4887871    96.50939     AL
## 3: Alabama 2020-03-15     1    23      0      1    4887871    96.50939     AL
## 4: Alabama 2020-03-16     1    29      0      1    4887871    96.50939     AL
## 5: Alabama 2020-03-17     1    39      0      1    4887871    96.50939     AL
## 6: Alabama 2020-03-18     1    51      0      1    4887871    96.50939     AL
tail(cv_states)
##      state       date  fips  cases deaths geo_id population pop_density    abb
##     <fctr>     <Date> <int>  <int>  <int>  <int>      <int>       <num> <fctr>
## 1: Wyoming 2023-03-18    56 185640   2009     56     577737    5.950611     WY
## 2: Wyoming 2023-03-19    56 185640   2009     56     577737    5.950611     WY
## 3: Wyoming 2023-03-20    56 185640   2009     56     577737    5.950611     WY
## 4: Wyoming 2023-03-21    56 185800   2014     56     577737    5.950611     WY
## 5: Wyoming 2023-03-22    56 185800   2014     56     577737    5.950611     WY
## 6: Wyoming 2023-03-23    56 185800   2014     56     577737    5.950611     WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
summary(cv_states)
##            state            date                 fips           cases         
##  Washington   : 1158   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
##  Illinois     : 1155   1st Qu.:2020-12-06   1st Qu.:16.00   1st Qu.:  112125  
##  California   : 1154   Median :2021-09-11   Median :29.00   Median :  418120  
##  Arizona      : 1153   Mean   :2021-09-10   Mean   :29.78   Mean   :  947941  
##  Massachusetts: 1147   3rd Qu.:2022-06-17   3rd Qu.:44.00   3rd Qu.: 1134318  
##  Wisconsin    : 1143   Max.   :2023-03-23   Max.   :72.00   Max.   :12169158  
##  (Other)      :51184                                                          
##      deaths           geo_id        population        pop_density       
##  Min.   :     0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
##  1st Qu.:  1598   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
##  Median :  5901   Median :29.00   Median : 4468402   Median :  107.860  
##  Mean   : 12553   Mean   :29.78   Mean   : 6397965   Mean   :  423.031  
##  3rd Qu.: 15952   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
##  Max.   :104277   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
##                                                      NA's   :1106       
##       abb       
##  WA     : 1158  
##  IL     : 1155  
##  CA     : 1154  
##  AZ     : 1153  
##  MA     : 1147  
##  WI     : 1143  
##  (Other):51184
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2023-03-23"

4. Add new_cases and new_deaths and correct outliers

# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
  cv_subset <- subset(cv_states, state == state_list[i])
  cv_subset <- cv_subset[order(cv_subset$date), ]

  # Add starting level for new cases and deaths
  cv_subset$new_cases <- cv_subset$cases[1]
  cv_subset$new_deaths <- cv_subset$deaths[1]

  # Calculate new cases and new deaths
  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] <- cv_subset$cases[j] - cv_subset$cases[j - 1]
    cv_subset$new_deaths[j] <- cv_subset$deaths[j] - cv_subset$deaths[j - 1]
  }

  # Include in main dataset
  cv_states$new_cases[cv_states$state == state_list[i]] <- cv_subset$new_cases
  cv_states$new_deaths[cv_states$state == state_list[i]] <- cv_subset$new_deaths
}

5. Add additional variables

# Add population-normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths / (cv_states$population / 100000), 1), nsmall = 1))

# Add a naive_CFR variable = deaths / cases
cv_states = cv_states %>%
  dplyr::mutate(naive_CFR = round((deaths * 100 / cases), 2))

# Create a `cv_states_today` variable
cv_states_today = subset(cv_states, date == max(cv_states$date))

II. Scatterplots

6. Explore scatterplots using plot_ly()

### FINISH CODE HERE ###

# pop_density vs. cases
cv_states_today %>%
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5))
## Warning: Ignoring 1 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state != "District of Columbia")

# pop_density vs. cases after filtering
cv_states_today_filter %>%
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5))
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# pop_density vs. deathsper100k
cv_states_today_filter %>%
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5))
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Adding hoverinfo
cv_states_today_filter %>%
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode = 'diameter', opacity = 0.5),
          hoverinfo = 'text',
          text = ~paste(paste(state, ":", sep = ""), 
                        paste(" Cases per 100k: ", per100k, sep = ""), 
                        paste(" Deaths per 100k: ", deathsper100k, sep = ""), sep = "<br>")) %>%
  layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
         yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
         hovermode = "compare")
## Warning: Ignoring 1 observations
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

7. Explore scatterplot trend interactively using ggplotly() and geom_smooth ()

### FINISH CODE HERE ###

# Create a scatterplot with ggplot for pop_density vs. deathsper100k
p <- ggplot(cv_states_today_filter, aes(x = pop_density, y = deathsper100k, size = population)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Relationship Between Population Density and Deaths per 100k",
    x = "Population Density",
    y = "Deaths per 100k"
  )

ggplotly(p)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation: size.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

It appears that as population density increases, deaths per 100k increases. It would appear that it population density is a correlate of newdeathsperweek100k.

8. Multiple line chart

### FINISH CODE HERE ###

# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, 
        type = "scatter", mode = "lines", 
        colors = RColorBrewer::brewer.pal(12, "Set3"))
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
  filter(state == "Florida") %>%
  plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>%
  add_trace(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")
cv_states %>% 
  filter(state == "Florida") %>% 
  plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines", name = "New Cases", 
          line = list(color = "blue")) %>% 
  add_trace(y = ~new_deaths, name = "New Deaths", line = list(color = "red")) %>%
  layout(
    title = "New Cases and Deaths in Florida Over Time",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Count"),
    legend = list(title = list(text = "Metric"))
  )

It seems that there is about a 2 month delay between new cases spiking and deaths spiking.

9. Heatmaps

### FINISH CODE HERE ###

### Map state, date, and new_cases to a matrix
cv_states_mat <- cv_states %>%
  select(state, date, new_cases) %>%
  dplyr::filter(date > as.Date("2021-06-15"))

cv_states_mat2 <- as.data.frame(
  pivot_wider(cv_states_mat, names_from = state, values_from = new_cases, values_fill = 0)
)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Replace any remaining NA values with 0
cv_states_mat2[is.na(cv_states_mat2)] <- 0

# Verify the range of the data
range(cv_states_mat2, na.rm = TRUE)
## [1] -18255 227972
# Create the heatmap
plot_ly(x = colnames(cv_states_mat2), 
        y = rownames(cv_states_mat2),
        z = ~cv_states_mat2,
        type = "heatmap",
        showscale = TRUE)
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by = "2 weeks")

cv_states_mat <- cv_states %>%
  select(state, date, new_cases) %>%
  filter(date %in% filter_dates)

cv_states_mat2 <- as.data.frame(
  pivot_wider(cv_states_mat, names_from = state, values_from = new_cases, values_fill = 0)
)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

plot_ly(x = colnames(cv_states_mat2), 
        y = rownames(cv_states_mat2),
        z = ~cv_states_mat2,
        type = "heatmap",
        showscale = TRUE)

10. Map

### FINISH CODE HERE ###

pick.date <- "2021-10-15"

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>%
  filter(date == pick.date) %>%
  select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>',
                                         "Cases per 100k: ", newper100k, '<br>',
                                         "Cases: ", cases, '<br>',
                                         "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Make sure both maps are on the same color scale
shadeLimit <- 125

# Create the map
fig_pick.date <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  ) %>%
  colorbar(title = paste0("Cases per 100k: ", pick.date), limits = c(0, shadeLimit)) %>%
  layout(
    title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
    geo = set_map_details
  )

### Map for today's date
cv_per100 <- cv_states_today %>%
  select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>',
                                         "Cases per 100k: ", newper100k, '<br>',
                                         "Cases: ", cases, '<br>',
                                         "Deaths: ", deaths))

# Create the map
fig_Today <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  ) %>%
  colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0, shadeLimit)) %>%
  layout(
    title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
    geo = set_map_details
  )

### Plot together
subplot(fig_pick.date, fig_Today, nrows = 2, margin = 0.05)

So the naive case fatality rate is a lot lower in most states in 2024. California seemed to increase as well as Ohio and Illinois. Overall, the US is shaded more lightly indicating lower case fatality rates.